coronavirus is an R package created and curated by Rami Krispin and Jarrett Byrnes and published on the CRAN repository. Currently, it contains two data sets, coronavirus & covid19_vaccine, that are updated daily and include data on cases, deaths, vaccinations, population, and countries, among other geospatial data. Notice that the coronavirus data set and the coronavirus package share the same name. You’ll be able to distinguish the package in RStudio via the :: in coronavirus:: when typed within an R chunk.
Let’s begin by installing the package and a few supporting tools with install.packages(), then loading them with library().
# Use quotes "" for installing packages
#install.packages("tidyverse")
#install.packages("plotly")
#install.packages("DT")
#install.packages("coronavirus")
# Load them in
library(tidyverse)
library(plotly) # interactive maps and charts
library(DT) # interactive data tables
library(coronavirus)
# coronavirus
# Preview the first rows instead of printing the whole data frame
coronavirus %>%
  head()
## date province country lat long type cases uid iso2 iso3
## 1 2020-01-22 Alberta Canada 53.9333 -116.5765 confirmed 0 12401 CA CAN
## 2 2020-01-23 Alberta Canada 53.9333 -116.5765 confirmed 0 12401 CA CAN
## 3 2020-01-24 Alberta Canada 53.9333 -116.5765 confirmed 0 12401 CA CAN
## 4 2020-01-25 Alberta Canada 53.9333 -116.5765 confirmed 0 12401 CA CAN
## 5 2020-01-26 Alberta Canada 53.9333 -116.5765 confirmed 0 12401 CA CAN
## 6 2020-01-27 Alberta Canada 53.9333 -116.5765 confirmed 0 12401 CA CAN
## code3 combined_key population continent_name continent_code
## 1 124 Alberta, Canada 4413146 North America NA
## 2 124 Alberta, Canada 4413146 North America NA
## 3 124 Alberta, Canada 4413146 North America NA
## 4 124 Alberta, Canada 4413146 North America NA
## 5 124 Alberta, Canada 4413146 North America NA
## 6 124 Alberta, Canada 4413146 North America NA
# Inspect the last rows of the data frame as well
coronavirus %>%
  tail()
## date province country lat long type cases uid iso2
## 624907 2022-02-06 <NA> Zimbabwe -19.01544 29.15486 recovery 0 716 ZW
## 624908 2022-02-07 <NA> Zimbabwe -19.01544 29.15486 recovery 0 716 ZW
## 624909 2022-02-08 <NA> Zimbabwe -19.01544 29.15486 recovery 0 716 ZW
## 624910 2022-02-09 <NA> Zimbabwe -19.01544 29.15486 recovery 0 716 ZW
## 624911 2022-02-10 <NA> Zimbabwe -19.01544 29.15486 recovery 0 716 ZW
## 624912 2022-02-11 <NA> Zimbabwe -19.01544 29.15486 recovery 0 716 ZW
## iso3 code3 combined_key population continent_name continent_code
## 624907 ZWE 716 Zimbabwe 14862927 Africa AF
## 624908 ZWE 716 Zimbabwe 14862927 Africa AF
## 624909 ZWE 716 Zimbabwe 14862927 Africa AF
## 624910 ZWE 716 Zimbabwe 14862927 Africa AF
## 624911 ZWE 716 Zimbabwe 14862927 Africa AF
## 624912 ZWE 716 Zimbabwe 14862927 Africa AF
# Compact overview: dimensions plus the type and first values of every column
coronavirus %>%
  str()
## 'data.frame': 624912 obs. of 15 variables:
## $ date : Date, format: "2020-01-22" "2020-01-23" ...
## $ province : chr "Alberta" "Alberta" "Alberta" "Alberta" ...
## $ country : chr "Canada" "Canada" "Canada" "Canada" ...
## $ lat : num 53.9 53.9 53.9 53.9 53.9 ...
## $ long : num -117 -117 -117 -117 -117 ...
## $ type : chr "confirmed" "confirmed" "confirmed" "confirmed" ...
## $ cases : int 0 0 0 0 0 0 0 0 0 0 ...
## $ uid : num 12401 12401 12401 12401 12401 ...
## $ iso2 : chr "CA" "CA" "CA" "CA" ...
## $ iso3 : chr "CAN" "CAN" "CAN" "CAN" ...
## $ code3 : num 124 124 124 124 124 124 124 124 124 124 ...
## $ combined_key : chr "Alberta, Canada" "Alberta, Canada" "Alberta, Canada" "Alberta, Canada" ...
## $ population : num 4413146 4413146 4413146 4413146 4413146 ...
## $ continent_name: chr "North America" "North America" "North America" "North America" ...
## $ continent_code: chr "NA" "NA" "NA" "NA" ...
# Number of distinct countries in the data:
# keep one row per country, then count the remaining rows
coronavirus %>%
  distinct(country) %>%
  count()
## n
## 1 198
# Careful: counting rows does NOT add up the values stored in `cases`;
# it only reports how many rows the data frame has
coronavirus %>%
  select(cases) %>%
  tally()
## n
## 1 624912
# sum() adds up the values themselves, which works because `cases` is numeric.
# Note: this total spans every `type`, so confirmed cases and deaths are
# combined in the single figure it returns.
coronavirus %>%
  pull(cases) %>%
  sum()
## [1] 414258063
# coronavirus %>%
# select(type) %>%
# sum()
# Same total restricted to rows whose country is "US"
# (again summed across every `type`)
coronavirus %>%
  filter(country == "US") %>%
  pull(cases) %>%
  sum()
## [1] 78570648
# Does the data frame contain any missing values at all?
coronavirus %>%
  is.na() %>%
  any()
## [1] TRUE
# Total number of missing cells
coronavirus %>%
  is.na() %>%
  sum()
## [1] 560240
# Missing cells per column
coronavirus %>%
  is.na() %>%
  colSums()
## date province country lat long
## 0 440672 0 3760 3760
## type cases uid iso2 iso3
## 0 0 8272 15040 15040
## code3 combined_key population continent_name continent_code
## 15040 8272 20304 15040 15040
# Location of each NA?
#which(is.na(coronavirus$population))
sum() the observations within a character vector?
# Remember, we can only use the sum() function on numerics
# coronavirus %>%
# select(type) %>%
# sum()
# List the distinct values stored in the "type" character column
unique(select(coronavirus, type))
## type
## 1 confirmed
## 212065 death
## 424129 recovery
# Add up `cases` separately for each value of `type`
# ("confirmed", "death" and "recovery")
coronavirus %>%
  group_by(type) %>%
  summarise(cases = sum(cases))
## # A tibble: 3 × 2
## type cases
## <chr> <int>
## 1 confirmed 408455835
## 2 death 5802228
## 3 recovery 0
# Totals for confirmed cases and deaths only; the recovery type is left out
total_cases <- coronavirus %>%
  filter(type != "recovery") %>% # "!=" means "not equal to"
  group_by(type) %>%
  summarise(cases = sum(cases)) %>%
  mutate(type = factor(type, levels = c("confirmed", "death")))
# A factor stores categorical data with an explicit, ordered set of levels;
# a plain character vector carries no such level information
total_cases
## # A tibble: 2 × 2
## type cases
## <fct> <int>
## 1 confirmed 408455835
## 2 death 5802228
# Break the confirmed and death totals down by country.
# summarise() peels off only the last grouping level, so the result is still
# grouped by `type`.
coronavirus %>%
  filter(type != "recovery") %>%
  group_by(type, country) %>%
  summarise(cases = sum(cases))
## # A tibble: 396 × 3
## # Groups: type [2]
## type country cases
## <chr> <chr> <int>
## 1 confirmed Afghanistan 169940
## 2 confirmed Albania 267020
## 3 confirmed Algeria 260723
## 4 confirmed Andorra 37140
## 5 confirmed Angola 98501
## 6 confirmed Antarctica 11
## 7 confirmed Antigua and Barbuda 7331
## 8 confirmed Argentina 8716940
## 9 confirmed Armenia 402403
## 10 confirmed Australia 2876687
## # … with 386 more rows
pivot_wider() to use observations as column names
# Here we will use pivot_wider() to draw out information from our "type" character vector. This will make new columns with "confirmed" and "death" as column names.
# Reshape the per-type totals so each country becomes one row with a
# `confirmed` and a `death` column, then derive deaths per confirmed case.
coronavirus %>%
  filter(type != "recovery") %>%
  group_by(type, country) %>%
  # .groups = "drop" returns an ungrouped tibble, so pivot_wider() is not asked
  # to reshape the very column the data would otherwise still be grouped by
  summarise(cases = sum(cases), .groups = "drop") %>%
  pivot_wider(names_from = type, values_from = cases) %>%
  mutate(death_rate = death / confirmed)
## # A tibble: 198 × 4
## country confirmed death death_rate
## <chr> <int> <int> <dbl>
## 1 Afghanistan 169940 7478 0.0440
## 2 Albania 267020 3402 0.0127
## 3 Algeria 260723 6703 0.0257
## 4 Andorra 37140 148 0.00398
## 5 Angola 98501 1898 0.0193
## 6 Antarctica 11 0 0
## 7 Antigua and Barbuda 7331 134 0.0183
## 8 Argentina 8716940 123859 0.0142
## 9 Armenia 402403 8145 0.0202
## 10 Australia 2876687 4543 0.00158
## # … with 188 more rows
DT package for interactive table
# Put it all together with the datatable() function
# Interactive table of confirmed cases, deaths and death rate per country,
# sorted from the most to the fewest confirmed cases.
coronavirus %>%
  filter(type != "recovery") %>%
  group_by(type, country) %>%
  # Drop the leftover grouping by `type` before that column is pivoted away
  summarise(cases = sum(cases), .groups = "drop") %>%
  mutate(type = factor(type, levels = c("confirmed", "death"))) %>%
  pivot_wider(names_from = type, values_from = cases) %>%
  mutate(death_rate = death / confirmed) %>%
  arrange(desc(confirmed)) %>%
  datatable(
    rownames = FALSE,
    # Display headers, in the column order country, confirmed, death, death_rate
    colnames = c("Country", "Confirmed Cases", "Death Cases", "Death Rate %")
  ) %>%
  # Show the death_rate fraction as a percentage with one decimal place
  formatPercentage("death_rate", 1)
# Re-display the confirmed/death totals stored in `total_cases`
total_cases
## # A tibble: 2 × 2
## type cases
## <fct> <int>
## 1 confirmed 408455835
## 2 death 5802228
# Total confirmed cases for each country
coronavirus %>%
  filter(type == "confirmed") %>% # "==" keeps only the rows of type "confirmed"
  group_by(country) %>%
  summarise(total_cases = sum(cases))
## # A tibble: 198 × 2
## country total_cases
## <chr> <int>
## 1 Afghanistan 169940
## 2 Albania 267020
## 3 Algeria 260723
## 4 Andorra 37140
## 5 Angola 98501
## 6 Antarctica 11
## 7 Antigua and Barbuda 7331
## 8 Argentina 8716940
## 9 Armenia 402403
## 10 Australia 2876687
## # … with 188 more rows
# Add a new column holding each country's share of all confirmed cases,
# i.e. total_cases divided by the sum of total_cases
coronavirus %>%
  filter(type == "confirmed") %>%
  group_by(country) %>%
  summarise(total_cases = sum(cases)) %>%
  mutate(percent = prop.table(total_cases))
## # A tibble: 198 × 3
## country total_cases percent
## <chr> <int> <dbl>
## 1 Afghanistan 169940 0.000416
## 2 Albania 267020 0.000654
## 3 Algeria 260723 0.000638
## 4 Andorra 37140 0.0000909
## 5 Angola 98501 0.000241
## 6 Antarctica 11 0.0000000269
## 7 Antigua and Barbuda 7331 0.0000179
## 8 Argentina 8716940 0.0213
## 9 Armenia 402403 0.000985
## 10 Australia 2876687 0.00704
## # … with 188 more rows
# DT: interactive table of the ten countries with the most confirmed cases.
# The result is stored as `confirmed_country` so that the print call at the end
# of this chunk finds it.
confirmed_country <- coronavirus %>%
  filter(type == "confirmed") %>%
  group_by(country) %>%
  summarise(total_cases = sum(cases)) %>%
  # The share is computed against all countries, before the top-10 cut below
  mutate(percent = total_cases / sum(total_cases)) %>%
  arrange(desc(total_cases)) %>%
  head(10) %>%
  datatable(
    rownames = FALSE,
    colnames = c("Country", "Cases", "Perc of Total")
  ) %>%
  # Show the `percent` fraction as a percentage with two decimal places
  formatPercentage("percent", 2)
confirmed_country
# One row per country with its confirmed total, largest first; every country
# is given the same "Confirmed" parent label for the treemap
treemap_df <- coronavirus %>%
  filter(type == "confirmed") %>%
  group_by(country) %>%
  summarise(total_cases = sum(cases)) %>%
  mutate(parents = "Confirmed") %>%
  arrange(desc(total_cases))

# Treemap whose tiles are sized by total_cases and labelled by country
treemap_df %>%
  plot_ly(
    type = "treemap",
    values = ~total_cases,
    labels = ~country,
    parents = ~parents,
    textinfo = "label+value+percent parent"
  )